R Markdown

library(ggplot2)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(ggplot2)
library(extracat)
# Load the complaints sample used for the missing-value analysis.
# The file is addressed with file.path() instead of calling setwd(), so the
# session's working directory is not changed as a side effect.
project_dir <- "/Users/divyahindupur/Documents/DS@CU/EDAV/HW/EDAV_project"
data_sample <- read.csv(file.path(project_dir, "top_complaints_sample500.csv"))

Analyzing missing data values on a sample of 500

# Missing-value summaries for the sample, drawn twice with aggr():
# first as absolute counts (prop = FALSE), then as proportions (prop = TRUE).
# seq_along() supplies exactly one label per column, numbered from 1 like R's
# own column indices (seq(0, n) would give n + 1 labels starting at 0).
aggr(
  data_sample,
  col = c("gray62", "tomato", "orange"),
  prop = FALSE,
  numbers = TRUE,
  labels = seq_along(data_sample),
  bars = TRUE,
  gap = 1,
  border = "white",
  main = "Missing data",
  xlab = "column index",
  cex.lab = 3,
  cex.axis = 4,
  cex.main = 4,
  cex.sub = 4
)

aggr(
  data_sample,
  col = c("gray62", "tomato", "orange"),
  prop = TRUE,
  numbers = TRUE,
  labels = seq_along(data_sample),
  bars = TRUE,
  gap = 1,
  border = "white",
  main = "Missing data",
  xlab = "column index",
  cex.lab = 3,
  cex.axis = 4,
  cex.main = 4,
  cex.sub = 4
)

# Missing-value pattern plot of the sample, drawn with visna().
visna(
  data_sample,
  freqvar = "Freq",
  tp = FALSE,
  fr = 1,
  fc = 1,
  sort = "n",
  sort.method = "count",
  col = "w",
  mar.col = c(alpha("black", 0.7), alpha("red", 0.7), "red", "green"),
  s = Inf,
  pmax = 1,
  opts = list(),
  plot = TRUE,
  cex.lab = 3,
  cex.axis = 4,
  cex.main = 4,
  cex.sub = 4
)

# Matrix plot of the sample with missing cells drawn in "tomato".
# seq_along() supplies exactly one label per column, numbered from 1 like R's
# own column indices (seq(0, n) would give n + 1 labels starting at 0).
matrixplot(
  data_sample,
  interactive = FALSE,
  col = "tomato",
  labels = seq_along(data_sample),
  main = "Missing data",
  xlab = "column index",
  cex.lab = 3,
  cex.axis = 4,
  cex.main = 4,
  cex.sub = 4
)

We see a lot of missing values in the columns between 35 and 50, so we explore these further:

# Zoom in on the sparsely populated columns. One index vector drives both the
# subset and the axis labels, so each of the 17 selected columns is labelled
# with its own position in data_sample (seq(35, 50) supplied only 16 labels).
# NOTE(review): the accompanying text speaks of columns 35 to 50 while the
# selection starts at 34 -- confirm which range is intended.
sparse_columns <- 34:50
sparse_data <- data_sample[, sparse_columns, drop = FALSE]

# Absolute counts of missing values per column and per combination.
aggr(
  sparse_data,
  col = c("gray62", "tomato", "orange"),
  prop = FALSE,
  numbers = TRUE,
  labels = sparse_columns,
  gap = 1,
  border = "white",
  bars = TRUE,
  main = "Missing data",
  xlab = "column index",
  cex.lab = 3,
  cex.axis = 4,
  cex.main = 4,
  cex.sub = 4
)

# Same summary expressed as proportions.
aggr(
  sparse_data,
  col = c("gray62", "tomato", "orange"),
  prop = TRUE,
  numbers = TRUE,
  labels = sparse_columns,
  gap = 1,
  border = "white",
  bars = TRUE,
  main = "Missing data",
  xlab = "column index",
  cex.lab = 3,
  cex.axis = 4,
  cex.main = 4,
  cex.sub = 4
)

Columns 41 to 50 are completely empty, so we discard them.

# Build dense_data by removing the columns judged to be empty.
# Columns are picked by position, translated to names, and then excluded by
# name. drop = FALSE keeps the result a data frame even if only one column
# were to remain.
# NOTE(review): the accompanying text says columns 41 to 50 are discarded,
# but positions 35 and 40 are removed here as well -- confirm the intended set.
empty_columns <- names(data_sample)[c(35, 40:50)]
dense_data <- data_sample[, !names(data_sample) %in% empty_columns, drop = FALSE]

And now the data is dense:

# Matrix plot of the data that remains after the empty columns were dropped.
# Labels are the positions the remaining columns had in data_sample, giving
# one label per plotted column (seq(0, ncol(data_sample)) has more entries
# than dense_data has columns).
matrixplot(
  dense_data,
  interactive = FALSE,
  col = "tomato",
  labels = match(names(dense_data), names(data_sample)),
  main = "Missing data",
  xlab = "column index",
  cex.lab = 3,
  cex.axis = 4,
  cex.main = 4,
  cex.sub = 4
)

# Proportion of missing values in the data that remains after the empty
# columns were dropped. Labels are the positions the remaining columns had in
# data_sample, giving one label per plotted column
# (seq(0, ncol(data_sample)) has more entries than dense_data has columns).
aggr(
  dense_data,
  col = c("gray62", "tomato", "orange"),
  prop = TRUE,
  numbers = TRUE,
  labels = match(names(dense_data), names(data_sample)),
  gap = 1,
  border = "white",
  bars = TRUE,
  main = "Missing data",
  xlab = "column index",
  cex.lab = 3,
  cex.axis = 4,
  cex.main = 4,
  cex.sub = 4
)

Visualizing the entire data

# Load the full complaints data set used for the plots below.
# The file is addressed with file.path() instead of calling setwd(), so the
# session's working directory is not changed as a side effect.
project_dir <- "/Users/divyahindupur/Documents/DS@CU/EDAV/HW/EDAV_project"
all_data <- read.csv(file.path(project_dir, "Top_5_complaints.csv"))

Which agency got the highest number of complaints?

# Bar chart of the number of rows per Agency.
ggplot(data = all_data, mapping = aes(x = Agency)) +
  geom_bar(fill = "RoyalBlue")

# Bar chart of the number of rows per Agency, one panel per year, laid out in
# three columns.
ggplot(data = all_data, mapping = aes(x = Agency)) +
  geom_bar(fill = "Tomato") +
  facet_wrap(~year, ncol = 3)

Which zip code had the highest number of complaints?

# Bar chart of the number of rows per Incident.Zip, with the x axis limited to
# 10000 to 11750; rows outside these limits are removed by the scale.
ggplot(data = all_data, mapping = aes(x = Incident.Zip)) +
  geom_bar(fill = "RoyalBlue") +
  xlim(10000, 11750)
## Warning: Removed 2396 rows containing non-finite values (stat_count).

Complaint types for Boroughs:

 # Bar chart of the number of rows per Complaint.Type, one panel per Borough,
 # laid out in two columns.
 ggplot(data = all_data, mapping = aes(x = Complaint.Type)) +
   geom_bar(fill = "Tomato") +
   facet_wrap(~Borough, ncol = 2)

Complaint types by year:

 # Bar chart of the number of rows per Complaint.Type, one panel per year,
 # laid out in two columns.
 ggplot(data = all_data, mapping = aes(x = Complaint.Type)) +
   geom_bar(fill = "Tomato") +
   facet_wrap(~year, ncol = 2)